# Commands for text analysis of the InformationSearch responses

setwd("~/Dropbox/Information and Predictions/STM")
library(stm)

# The following packages are also needed (uncomment to install if missing):
#install.packages("igraph")
#install.packages("wordcloud")
#install.packages("Rtsne")
#install.packages("rsvd")
#install.packages("geometry")

# Read in the dataset of responses.
data <- read.csv("info_responses.csv")

# Make "control" the reference (first) level of the treatment factor.
# The explicit factor() conversion is required because read.csv() returns
# character columns by default since R 4.0.0, and relevel() only accepts
# factors; on a column that is already a factor it is a harmless no-op
# (apart from dropping unused levels).
data$treatment <- relevel(factor(data$treatment), ref = "control")

# Pre-process the raw response text with stm's textProcessor(), carrying the
# whole data frame along as document-level metadata.
processed <- textProcessor(documents = data$text, metadata = data)

# Assemble the corpus for modelling, using a lower threshold of 20 for
# dropping infrequent terms.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 20
)

# Convenience handles on the three components of the prepared corpus.
docs <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]

# Diagnostic plot to help choose the word-prevalence threshold.
# Note: seq(1, 200, by = 100) evaluates only two thresholds, 1 and 101.
plotRemoved(
  documents = processed$documents,
  lower.thresh = seq(from = 1, to = 200, by = 100)
)


# Evaluate several candidate numbers of topics (3, 5, 7 and 10), using
# treatment as the prevalence covariate.
storage <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(3, 5, 7, 10),
  prevalence = ~treatment,
  data = meta
)

# Plot the searchK results; the generic dispatches to the searchK plot method.
plot(storage)

# Fit the STM with K = 7 topics, treatment as the prevalence covariate,
# spectral initialisation and at most 250 EM iterations.
infoPrevFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 7,
  prevalence = ~treatment,
  data = out$meta,
  init.type = "Spectral",
  max.em.its = 250
)

# Print the exclusivity of the fitted topics.
exclusivity(infoPrevFit)

# Estimate and plot the correlations between topics.
corMat <- topicCorr(infoPrevFit, method = "simple")
plot(corMat)

# Plot the prevalence of the topics, with the x-axis fixed to [0, 1].
plot(infoPrevFit, type = "summary", xlim = c(0, 1))

# Word clouds for each topic, written to topic1.pdf ... topic7.pdf in the
# working directory (cloud() needs the wordcloud package, see the install
# hints at the top of the file).
# The upper bound must match the K used when fitting infoPrevFit.
for (topic_id in seq_len(7)) {
  pdf(sprintf("topic%d.pdf", topic_id))
  # Always close the PDF device, even if cloud() fails; otherwise the device
  # stays open and later plots are silently written into the stale file.
  tryCatch(
    cloud(infoPrevFit, topic = topic_id),
    finally = dev.off()
  )
}


# Estimate the effect of treatment on the prevalence of each of the 7 topics.
# The argument is spelled out as `metadata` (its full name) instead of relying
# on partial matching of `meta`.
prep <- estimateEffect(
  1:7 ~ treatment,
  infoPrevFit,
  metadata = out$meta,
  uncertainty = "Global"
)

# Print the regression tables for all 7 topics, using 500 simulations.
summary(prep, topics = 1:7, nsim = 500)

# Each plot below shows, for topics 1-7, the difference in topic prevalence
# between one treatment arm and the control group. custom.labels must supply
# exactly one label per plotted topic.

# Bonus vs. control, topics labelled by number.
plot(
  prep,
  covariate = "treatment",
  topics = 1:7,
  model = infoPrevFit,
  method = "difference",
  cov.value1 = "bonus",
  cov.value2 = "control",
  xlab = "Prevalence in Bonus Compared to Control",
  labeltype = "custom",
  custom.labels = as.character(1:7)
)

# Random bonus vs. control, topics labelled by number.
plot(
  prep,
  covariate = "treatment",
  topics = 1:7,
  model = infoPrevFit,
  method = "difference",
  cov.value1 = "random bonus",
  cov.value2 = "control",
  xlab = "Prevalence in Random Bonus Compared to Control",
  labeltype = "custom",
  custom.labels = as.character(1:7)
)

# Lottery vs. control, topics labelled with a descriptive word each.
plot(
  prep,
  covariate = "treatment",
  topics = 1:7,
  model = infoPrevFit,
  method = "difference",
  cov.value1 = "lottery",
  cov.value2 = "control",
  xlab = "Prevalence in Lottery Compared to Control",
  labeltype = "custom",
  custom.labels = c(
    "thought", "predict", "guess", "news", "right", "answer", "inform"
  ),
  width = 70
)






